#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import re
import sys
from unicodedata import category as cat
import jieba

def tbl_p_list():
    """
    Build a list of punctuation characters (Chinese, English and others).

    Walks the code points in ``range(sys.maxunicode)`` and keeps every
    character whose Unicode general category begins with ``'P'``.

    Returns:
        A list of one-character strings in ascending code-point order.
    """
    punctuation = []
    for code_point in range(sys.maxunicode):
        symbol = chr(code_point)
        # General categories are two-letter codes; the 'P*' family is punctuation.
        if cat(symbol)[0] == 'P':
            punctuation.append(symbol)
    return punctuation


def load_jiaba(user_dict_jieba_path):
    """
    Configure the module-level jieba tokenizer from one dictionary file.

    The same path is passed first to ``jieba.load_userdict`` and then to
    ``jieba.set_dictionary``; ``jieba.initialize()`` is called afterwards and
    a confirmation message is printed to stdout.

    Args:
        user_dict_jieba_path: Path of the dictionary file handed to both
            jieba calls.

    NOTE(review): the file is registered both as a user dictionary and as
    the main dictionary. Presumably ``set_dictionary`` followed by
    ``initialize()`` rebuilds the vocabulary from this file alone, which
    would make the earlier ``load_userdict`` call redundant -- confirm
    against the jieba documentation that this call order is intended.
    """
    # Disabled example of registering a single word instead of a file:
    #jieba.add_word('ＩＰ')
    jieba.load_userdict(user_dict_jieba_path)                     # load the custom dictionary
    jieba.set_dictionary(user_dict_jieba_path) 
    jieba.initialize()
    print('加载自定义词典成功')

def ws_jieba(_text_, stopwords):
    """
    Segment a text with jieba and drop stop words.

    Every run of whitespace in ``_text_`` is collapsed to a single space
    before segmentation. Tokens that equal a stop word, or that are exactly
    one space, are discarded; the surviving tokens are returned with
    surrounding whitespace stripped.

    Args:
        _text_: The string to segment.
        stopwords: Iterable of tokens to discard (list, tuple, set, ...).

    Returns:
        A list of the remaining tokens, in their original order.

    Raises:
        TypeError: If ``_text_`` is not a string or ``stopwords`` is not
            iterable.
    """
    # Build the exclusion set once. Concatenating `stopwords + [' ']` for
    # every token (as a list) costs O(len(stopwords)) per token and only
    # works when `stopwords` is itself a list.
    excluded = set(stopwords)
    excluded.add(' ')

    # Normalise all whitespace runs so that the only whitespace token jieba
    # can emit is a single space, which is filtered out below.
    normalized = re.sub(r'\s+', ' ', _text_)

    # The filter is applied before strip(), so a token is compared to the stop words in
    # its raw form.
    return [token.strip() for token in jieba.cut(normalized) if token not in excluded]

def submissions_BOW_JB(df, dfc, stopwords):
    """
    Add a ``BOW_JB`` column of jieba tokens to ``dfc``.

    For each label in ``dfc.index`` the matching entry of ``df.doc`` is
    tokenized with ``ws_jieba`` and the resulting token list is stored in
    ``dfc['BOW_JB']``.

    Args:
        df: Table providing the ``doc`` column with the raw text.
        dfc: Table whose index selects the rows to process; it is modified
            in place.
        stopwords: Stop-word collection forwarded to ``ws_jieba``.

    Returns:
        The same ``dfc`` object, now carrying the ``BOW_JB`` column.
    """
    token_lists = []
    for row_label in dfc.index:
        raw_text = df.doc[row_label]
        token_lists.append(ws_jieba(raw_text, stopwords))
    dfc['BOW_JB'] = token_lists
    return dfc


